import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import pygeohash as gh
import geohash2
from fuzzywuzzy import process, fuzz
import geopy
from geopy.distance import geodesic
import time
import shap
import operator
import math
# load JS visualization code to notebook
shap.initjs()
import gc
# geospatial plotting on a map
import folium
from folium.plugins import MarkerCluster
from folium import Choropleth, Circle, Marker
# Models
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,\
ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
import lightgbm
from lightgbm import LGBMRegressor
# Model selection, preprocessing and feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, KFold
from sklearn.model_selection import cross_validate, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Hyperparameter tuning and optimization
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll import scope
from sklearn.feature_selection import RFE, RFECV
# static plotting
import matplotlib.pyplot as plt
import seaborn as sns #Graph library that use matplot in background
# plotly libraries (interactive, offline/notebook mode)
import cufflinks as cf
cf.go_offline()
import plotly.offline as py
py.init_notebook_mode(connected=True) # this code, allow us to work with offline plotly version
import plotly.graph_objs as go # it's like "plt" of matplot
# these two lines allow your code to show up in a notebook
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
import widgetsnbextension
from IPython.display import display, HTML
# normality tests
from scipy import stats
# display full dataframes (disable pandas row/column truncation)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
'''Set a seed for reproducibility'''
# passed as random_state to every model constructed below
seed = 1
'''Set path where all data files are located'''
# machine-specific data directory; LoadData appends '<file>.csv' to it
path = 'C:/Users/Nikitas/Desktop/Nikitas/other_pet_projects/recognyte/'
'''Create load data function to easy import all necessary'''
def LoadData(path, file):
    """Load a CSV file from the given directory.

    param path:
        directory containing the data files (with or without a trailing
        separator — os.path.join handles both, unlike raw concatenation).
    param file:
        file name without the '.csv' extension.
    return:
        DataFrame with the file contents.
    """
    import os  # local import: keeps this helper self-contained
    data = pd.read_csv(os.path.join(path, file + '.csv'))
    print ("Data is loaded!")
    print ("Data: ",data.shape[0],"observations, and ",data.shape[1],"features")
    return data
'''Create function for providing summary statistics in a table'''
def resumetable(df):
    """Build a per-column overview table of *df*: dtype, missing count,
    unique count, basic statistics and the Shannon entropy (base 2) of each
    column's value distribution."""
    print(f"Dataset Shape: {df.shape}")
    overview = df.dtypes.to_frame('dtypes').reset_index()
    overview['Name'] = overview['index']
    overview = overview[['Name', 'dtypes']]
    overview['Missing'] = df.isnull().sum().values
    overview['Uniques'] = df.nunique().values
    overview['Average'] = df.mean().values
    overview['Median'] = df.median().values
    overview['Maximum'] = df.max().values
    overview['Minimum'] = df.min().values
    overview['Standard Deviation'] = df.std().values
    # entropy of each column's empirical value distribution, rounded to 2 dp
    for name in overview['Name'].value_counts().index:
        freq = df[name].value_counts(normalize=True)
        overview.loc[overview['Name'] == name, 'Entropy'] = round(stats.entropy(freq, base=2), 2)
    return overview
'''Create function for data understanding to assess if there are features with missing values'''
def MissingData(df, thres):
    """
    Print a table of missing-value counts and percentages per column, then
    drop every column whose non-null count is below len(df) * thres.
    param df:
        dataframe to evaluate upon missing data
    param thres:
        required completeness fraction (0..1) for a column to be kept
    return:
        dataframe with the under-threshold columns dropped
    """
    null_counts = df.isnull().sum()
    missing_data = pd.concat(
        [null_counts.sort_values(ascending=False),
         (null_counts / df.isnull().count()).sort_values(ascending=False)],
        axis=1, keys=['Total', 'Percent'])
    print(missing_data.head(20))
    kept = df.dropna(thresh=len(df) * thres, axis=1)
    print('Threshold is: ' + str(thres))
    print(f"We dropped {df.shape[1] - kept.shape[1]} features in the combined set")
    return kept
'''Clean greek localites dataframe by splitting lat long to separate columns'''
def CleanGreekLocalities(df):
    """
    Split the WKT-style 'pnt' column (e.g. 'POINT (23.5 37.9)') into two
    string columns.
    param df:
        dataframe with a 'pnt' column to clean
    return:
        copy of df with separate 'Lat' and 'Long' string columns added
        (NOTE(review): WKT order is usually (lon lat), so 'Lat' may actually
        hold the longitude — confirm against the source data)
    """
    df = df.copy()
    # strip the 'POINT' prefix characters from both ends
    df['pnt'] = df['pnt'].map(lambda x: x.strip('POINT'))
    # FIX: '(' / ')' are regex metacharacters; the original relied on pandas'
    # deprecated implicit literal handling — make the literal replace explicit.
    df['pnt'] = df['pnt'].str.replace(')', '', regex=False)
    df['pnt'] = df['pnt'].str.replace('(', '', regex=False)
    df['pnt'] = df['pnt'].apply(lambda x: x.lstrip())
    # split 'lat lon' on the single space into two string columns
    df[['Lat','Long']] = df['pnt'].apply(lambda x: pd.Series(str(x).split(" ")))
    return df
'''Create an automated way to fix property feature'''
def PropertyFix(asking_data):
    """
    Fuzzy-match the unique values of 'prop_type' against each other and map
    frequent, similar spellings onto one canonical lower-case name.

    param asking_data:
        dataframe containing the 'prop_type' column to be cleaned.
    return:
        the input dataframe with an extra 'property' column holding the
        canonical name (NaN where no confident match was found).
    """
    unique_property = asking_data['prop_type'].unique().tolist()
    # score every unique spelling against every other one; each tuple is
    # (original, match, token_sort_ratio score)
    score_sort = [(x,) + i
                  for x in unique_property
                  for i in process.extract(x, unique_property, scorer=fuzz.token_sort_ratio)]
    # relative frequency of each raw spelling in the data
    proptype_valcounts = asking_data['prop_type'].value_counts(normalize=True)
    # rename columns properly
    proptype_valcounts = proptype_valcounts.rename_axis('property').reset_index()
    proptype_valcounts = proptype_valcounts.rename(columns={"prop_type": "percentage"})
    # one row per (property, match, score) tuple
    similarity_sort = pd.DataFrame(score_sort, columns=['property','match_sort','score_sort'])
    # attach each spelling's frequency
    similarity_sort = pd.merge(similarity_sort,proptype_valcounts,on = 'property',how = 'left')
    # canonical name = element-wise lexicographic minimum of the pair, lower-cased
    similarity_sort['sorted_property'] = np.minimum(similarity_sort['property'], similarity_sort['match_sort'])
    similarity_sort['sorted_property'] = similarity_sort['sorted_property'].str.lower()
    similarity_sort  # no-op left over from notebook inspection
    # keep only confident matches (score >= 50) of reasonably frequent spellings (>= 1%)
    high_score_sort = similarity_sort[(similarity_sort['score_sort'] >= 50) &((similarity_sort['percentage'] >= 0.01))]
    # one mapping row per raw spelling
    high_score_sort = high_score_sort[["match_sort","sorted_property"]].drop_duplicates()
    high_score_sort = high_score_sort.rename(columns={"match_sort": "prop_type", "sorted_property": "property"})
    # left-join the canonical name back onto the main dataframe
    asking_data = pd.merge(asking_data,high_score_sort, on = "prop_type", how = "left")
    return asking_data
'''Convert all text fields containing numeric values to floats'''
def text_to_float(text):
    """Parse a numeric string with mixed separators into a float.

    Handles European decimals ('1.234,56' -> 1234.56), plain thousands
    commas ('1,234.56' -> 1234.56) and stray '-' / '_' separators.
    NOTE(review): a leading minus sign is also stripped, so negative inputs
    lose their sign — this mirrors the original behavior.
    """
    value = text
    last_dot = value.rfind('.')
    if value.rfind(',') > last_dot:
        # European notation: dots are thousands separators, comma is decimal
        value = value.replace(".", "").replace(",", ".")
    elif value.rfind('-') > last_dot:
        value = value.replace("-", "")
    elif value.rfind('_') > last_dot:
        value = value.replace("_", "")
    else:
        # plain thousands commas
        value = value.replace(",", "")
    return float(value)
'''Convert defined categorical columns to one hot encoded ones'''
def one_hot_encoder(df, ohe_cols=None):
    '''
    One-hot encode the given categorical columns via pd.get_dummies.

    param df: dataframe to transform.
    param ohe_cols: list of column names to encode; default (None) encodes
        nothing. FIX: the original used a mutable default argument ([]),
        which is a classic Python pitfall; None keeps the same behavior.
    return: dataframe with the encoded columns appended.
    '''
    if ohe_cols is None:
        ohe_cols = []
    print('Creating OHE features..\nOld df shape:{}'.format(df.shape))
    df = pd.get_dummies(df, columns=ohe_cols)
    print('New df shape:{}'.format(df.shape))
    return df
'''Decode hash code to lat long coordinates using geohash library'''
def gh_decode(hash):
    """Decode a geohash string into a Series with 'latitude' and
    'longitude' entries (values as returned by geohash2.decode)."""
    coords = geohash2.decode(hash)
    return pd.Series(dict(zip(("latitude", "longitude"), coords)))
'''Calculate distance between two given locations with lat long coordinates'''
def calculate_distance(location, set_coord):
    """Return the geodesic distance in kilometres between two points.

    param location: (latitude, longitude) tuple.
    param set_coord: (latitude, longitude) tuple.
    return: distance in kilometres (float).
    """
    # FIX: the original built a throwaway one-row DataFrame just to read the
    # scalar back out; returning it directly is equivalent and far cheaper —
    # this function is applied per-row over the whole dataset.
    return geodesic(location, set_coord).kilometers
'''Find closest point between a given location and a list of locations'''
def closest_point(location, df):
    """Find the row(s) of *df* geodesically closest to *location*.

    param location: (latitude, longitude) tuple.
    param df: dataframe with 'Lat', 'Long' and 'Locality' columns
        (assumes a 0..n-1 integer index, as produced by read_csv).
    return: tuple (array of closest locality names, array of distances in km).
    """
    records = []
    for i in range(len(df)):
        # BUG FIX: the original built {lat, lon} as a *set* literal, whose
        # iteration order is arbitrary, so latitude and longitude could be
        # silently swapped before reaching geodesic(); use an ordered tuple.
        coord = (float(df['Lat'][i]), float(df['Long'][i]))
        distance = geodesic(location, coord)
        records.append([df['Locality'][i], distance.kilometers])
    # avoid shadowing the input dataframe (the original rebound `df`)
    result = pd.DataFrame(records, columns=['Locality', 'Distance'])
    result = result[result['Distance'] == result['Distance'].min()]
    return result['Locality'].values, result['Distance'].values
'''All feature engineering steps happen inside this function'''
def Data_Preparation(df):
    """Run the full cleaning / feature-engineering pipeline on the raw asking data.

    Steps: fix misspelled property types, parse textual prices to floats,
    engineer per-floor / per-bedroom ratios, decode geohashes to coordinates,
    add distance-to-landmark features, one-hot encode categoricals,
    log-transform the target and drop IQR outliers.

    param df:
        main dataframe (asking_data) with all raw features and the target variable.
    return:
        cleaned dataframe restricted to the modelling columns, duplicates removed.
    """
    start = time.time()
    # canonicalise misspelled property types (adds the 'property' column)
    df = PropertyFix(df)
    # apply function to convert to float all pricing fields
    df["price"] = df["price"].apply(text_to_float)
    df["price_m2"] = df["price_m2"].apply(text_to_float)
    # Create new var price_m2 x area
    df["price new"] = df["price_m2"] * df["area"]
    # keep only numeric values for bedroom feature
    # NOTE(review): relies on str.replace treating the pattern as a regex
    # (the old pandas default) — confirm on the pandas version in use
    df['bedrooms'] = df['bedrooms'].str.replace(r'\D', '')
    # create one hot encoding for special cases in floor feature
    special_cases_floor = ['Ground floor', 'Mezzanine', 'Basement', 'Semi-basement']
    # flag each special floor label with its own ohe column, then collapse
    # the label itself to floor number 1
    for case in special_cases_floor:
        df[case + ' ohe'] = np.where((df['floor'] == case), 1,0)
        df['floor'] = np.where((df['floor'] == case), 1,df['floor'])
    # impute all cases where floor = 0 to floor = 1 as there is no house without any floors even if it is ground floor
    df['floor'] = np.where((df['floor'] == 0), 1,df['floor'])
    # convert columns to numeric
    df[["price", "price_m2","bedrooms", "floor"]] = df[["price", "price_m2","bedrooms", "floor"]].apply(pd.to_numeric)
    # create features which will combine information from area , number of bedrooms and number of floors
    #bedrooms per floor
    df['bedrooms_per_floor'] = df['bedrooms'] / df['floor']
    # area per floor
    df['area_per_floor'] = df['area'] / df['floor']
    # area per bedroom
    df['area_per_bedroom'] = df['area'] / df['bedrooms']
    # convert infinity values (division by zero above) to na
    df = df.replace(np.inf, np.nan)
    # because some properties have no bedrooms need to impute calculated features with filling nas to 0
    df = df.fillna(0)
    # Convert geohash code to lat long
    df['geohash_location'] = df.apply(lambda rec: gh.decode(rec['geohash']), axis=1)
    # adds 'latitude'/'longitude' columns via the gh_decode helper above
    df = df.join(df["geohash"].apply(gh_decode))
    # define specific points of reference where distance will be calculated as a proxy for price of the property
    athens = (37.983810, 23.727539)
    skg = (40.629269, 22.947412)
    crete_her = (35.341846, 25.148254)
    crete_cha = (35.513828, 24.018038)
    alexandr = (40.849136, 25.879326)
    glyfada = (37.865044, 23.755045)
    kifisia = (38.07438, 23.81106)
    ioannina = (39.665028, 20.853746)
    mykonos= (37.450001, 25.350000)
    kalamata = (37.036598, 22.114401)
    volos = (39.366669, 22.933332)
    patra = (38.246639,21.734573)
    komotini = (41.122440,25.406557)
    nafplio = (37.56829, 22.80528)
    kavala = (40.93959, 24.40687)
    chalkidiki = (40.369500, 23.287085)
    rodos = (36.1667,28.0000)
    zakinthos = (37.7882,20.8988)
    paros = (37.0453,25.2126)
    argos = (37.642509,22.723160)
    # calculate distance features
    df['athens_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),athens), axis = 1)
    df['skg_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),skg), axis = 1)
    df['crete_her_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),crete_her), axis = 1)
    df['crete_cha_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),crete_cha), axis = 1)
    df['alexandr_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),alexandr), axis = 1)
    df['glyfada_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),glyfada), axis = 1)
    df['kifisia_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),kifisia), axis = 1)
    df['ioannina_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),ioannina), axis = 1)
    # NOTE(review): duplicate of the alexandr_distance line above — harmless but redundant
    df['alexandr_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),alexandr), axis = 1)
    df['mykonos_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),mykonos), axis = 1)
    df['kalamata_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),kalamata), axis = 1)
    df['volos_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),volos), axis = 1)
    df['patra_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),patra), axis = 1)
    df['komotini_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),komotini), axis = 1)
    df['nafplio_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),nafplio), axis = 1)
    df['kavala_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),kavala), axis = 1)
    df['chalkidiki_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),chalkidiki), axis = 1)
    df['rodos_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),rodos), axis = 1)
    df['zakinthos_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),zakinthos), axis = 1)
    df['paros_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),paros), axis = 1)
    df['argos_distance'] = df.apply(lambda x: calculate_distance((x['latitude'],x['longitude']),argos), axis = 1)
    # generate binary values using get_dummies
    df = one_hot_encoder(df, ohe_cols=['property'])
    # convert has parking to binary numeric variable
    # NOTE(review): compares against the literal string 'TRUE'; boolean True
    # or 'True' values would all map to 0 — confirm the source encoding
    df['has_parking'] = np.where((df['has_parking'] == 'TRUE'), 1,0)
    # create ohe for specific cases of construction year feature and then convert to numeric
    df['under construction ohe'] = np.where((df['construction_year'] == 'Under construction'), 1,0)
    df['unknown ohe'] = np.where((df['construction_year'] == '-'), 1,0)
    # map the sentinel labels step-by-step to '0' so the column can be parsed as numeric
    df['construction_year'] = np.where((df['construction_year'] == 'Under construction'), '-',df['construction_year'])
    df['construction_year'] = np.where((df['construction_year'] == '-'), '',df['construction_year'])
    df['construction_year'] = np.where((df['construction_year'] == ''), '0',df['construction_year'])
    df['construction_year'] = df['construction_year'].apply(pd.to_numeric)
    # if construction_year = 0 then replace it with nan value
    df['construction_year'] = df['construction_year'].replace(0, np.nan)
    # impute feature construction year with the median value for all those values where construction year is missing or it is not defined.
    df['construction_year']= df['construction_year'].fillna(df['construction_year'].median())
    # create variable number of years since building
    # NOTE(review): reference year 2021 is hard-coded
    df['house_age'] = 2021.0-df['construction_year']
    # rename index to id
    df = df.rename(columns={'Unnamed: 0': "id"})
    #log transform the target:
    df['log_price_m2'] = np.log1p(df['price_m2'])
    # drop outliers on the log target using the IQR rule with a loose fence
    # (out_iqr is defined later in this file; available at call time)
    om_y = out_iqr(df['log_price_m2'], k=3.0)
    # wrap the keep-mask in a Series
    s2= pd.Series(om_y)
    # boolean mask: keep only non-outlier rows
    df = df[s2.values]
    # print new length of df
    print("New df has length: ")
    print(len(df))
    # select certain columns
    # NOTE(review): assumes all listed get_dummies columns (property_*) exist
    # in this dataset; a missing category would raise KeyError — confirm
    df = df[['id','price_m2','log_price_m2',
             'latitude','longitude',
             'area',
             'bedrooms',
             'floor',
             'bedrooms_per_floor',
             'area_per_floor',
             'area_per_bedroom',
             'has_parking',
             'house_age',
             'construction_year',
             'Ground floor ohe',
             'Mezzanine ohe',
             'Basement ohe',
             'Semi-basement ohe',
             'athens_distance',
             'skg_distance',
             'crete_her_distance',
             'crete_cha_distance',
             'alexandr_distance',
             'glyfada_distance',
             'ioannina_distance',
             'mykonos_distance',
             'kalamata_distance',
             'volos_distance',
             'patra_distance',
             'komotini_distance',
             'nafplio_distance',
             'kavala_distance',
             'chalkidiki_distance',
             'rodos_distance',
             'zakinthos_distance',
             'paros_distance',
             'argos_distance',
             'property_apartment',
             'prop_type',
             'property_studio flat',
             'property_villa',
             'under construction ohe',
             'unknown ohe']].drop_duplicates()
    end = time.time()
    print("Total Time for Data Preparation is: ")
    print((end - start)/60)
    return df
'''Choose certain columns for fitting ML model'''
def ChooseColumns(df):
    """Return the modelling subset of *df*: the id, log target, engineered
    features, distance features and OHE columns, with duplicate rows removed."""
    model_columns = [
        'id', 'log_price_m2',
        'area',
        'bedrooms',
        'floor',
        'bedrooms_per_floor',
        'area_per_floor',
        'area_per_bedroom',
        'has_parking',
        'house_age',
        'construction_year',
        'Ground floor ohe',
        'Mezzanine ohe',
        'Basement ohe',
        'Semi-basement ohe',
        'athens_distance',
        'skg_distance',
        'crete_her_distance',
        'crete_cha_distance',
        'alexandr_distance',
        'glyfada_distance',
        'ioannina_distance',
        'mykonos_distance',
        'kalamata_distance',
        'volos_distance',
        'patra_distance',
        'komotini_distance',
        'nafplio_distance',
        'kavala_distance',
        'chalkidiki_distance',
        'rodos_distance',
        'zakinthos_distance',
        'paros_distance',
        'argos_distance',
        'property_apartment',
        'property_studio flat',
        'property_villa',
        'under construction ohe',
        'unknown ohe',
    ]
    return df[model_columns].drop_duplicates()
'''Convert main dataframe to train test split ready to train/validate model'''
def ModelCreation(df, testsize, target, selected_features):
    """Split *df* into train/test sets and robust-scale the features.

    param df:
        dataframe holding the target, an 'id' column and the features.
    param testsize:
        fraction of rows held out for testing.
    param target:
        name of the target column.
    param selected_features:
        feature columns to keep for modelling.
    return:
        x_train, x_test, y_train, y_test (features as scaled arrays).
    """
    labels = df[target].values
    feature_frame = df.drop([target, 'id'], axis=1)[selected_features]
    x_train, x_test, y_train, y_test = train_test_split(
        feature_frame, labels, test_size=testsize, random_state=42)
    # fit the scaler on the training split only, then apply it to both splits
    scaler = Pipeline([
        ('robust', RobustScaler())
    ])
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    return x_train, x_test, y_train, y_test
'''Shuffle data and return cross validation scores'''
def cv_results_print(model, X_train, y_train, test_size, seed):
    """
    Cross-validate *model* with a 10-split ShuffleSplit and print its scores.
    param model:
        estimator to cross-validate.
    param X_train:
        feature matrix.
    param y_train:
        target vector.
    param test_size:
        holdout fraction used by each shuffle split.
    param seed:
        random_state for the splitter.
    return:
        (rmse, mae) averaged over the splits.
    """
    start = time.time()
    n_splits = 10
    cv = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
    results_rmse = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    results_mae = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=-1, scoring='neg_mean_absolute_error')
    # scores are negated by sklearn; flip the sign before aggregating
    rmse = np.sqrt(-results_rmse).mean()
    mae = np.abs(-results_mae).mean()
    model_name = model.__class__.__name__
    model_params = model.get_params()
    print('Model: {}'.format(model_name))
    print('Model Parameters: {}'.format(model_params))
    # FIX: the original labels said "5 Fold" while the splitter uses 10 splits
    print('{} Fold CV RMSE: {:.4f}'.format(n_splits, rmse))
    print('{} Fold CV MAE: {:.4f}'.format(n_splits, mae))
    print('Computation Time: {:.2f}'.format(time.time() - start))
    return rmse, mae
def out_std(s, nstd=3.0, return_thresholds=False):
    """
    Return a boolean keep-mask (True = inlier) for a series using the
    mean +/- nstd * std rule, or the bounds themselves.
    param nstd:
        number of standard deviations from the mean that counts as an outlier
    :type nstd: ``float``
    param return_thresholds:
        True returns (lower, upper) bounds, good for plotting.
        False returns the boolean mask list.
    :type return_thresholds: ``bool``
    """
    mu, sigma = s.mean(), s.std()
    offset = sigma * nstd
    lower = mu - offset
    upper = mu + offset
    if return_thresholds:
        return lower, upper
    # keep everything not strictly outside the bounds
    return [not (x < lower or x > upper) for x in s]
def out_iqr(s, k=1.5, return_thresholds=False):
    """
    Return a boolean keep-mask (True = inlier) for a series using the
    interquartile-range rule, or the bounds themselves.
    param k:
        multiplier for the IQR fence
    :type k: ``float``
    param return_thresholds:
        True returns (lower, upper) bounds, good for plotting.
        False returns the boolean mask list.
    :type return_thresholds: ``bool``
    """
    q25, q75 = np.percentile(s, 25), np.percentile(s, 75)
    # fence = k times the interquartile range on each side
    fence = (q75 - q25) * k
    lower = q25 - fence
    upper = q75 + fence
    if return_thresholds:
        return lower, upper
    # keep everything not strictly outside the fences
    return [not (x < lower or x > upper) for x in s]
def CheckDistribution(df, feature):
    """
    Print describe() statistics and plot the distribution of the given feature.
    param df:
        dataframe where the feature is located.
    param feature:
        column name to summarise and plot.
    """
    print(df[feature].describe())
    plt.figure(figsize=(9, 8))
    # FIX: sns.distplot has been deprecated since seaborn 0.11 and removed in
    # 0.14; histplot is the supported equivalent for a histogram.
    sns.histplot(df[feature], color='g', bins=100, alpha=0.4)
def CheckNormality(df, feature):
    """
    Run the Shapiro-Wilk normality test on a feature and print the verdict.
    param df:
        dataframe where the feature is located.
    param feature:
        column name to test for normality.
    """
    stat, p = stats.shapiro(df[feature])
    print("Shapiro stat:", stat)
    print("P-value: ", p)
    # reject normality at the 1% significance level
    verdict = 'Normal Distribution' if p >= .01 else "Non-Normal Distribution"
    print(verdict)
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Drop one column of every pair whose absolute correlation coefficient
        is at least the threshold; reducing collinearity helps the model
        generalize and improves interpretability.
    Inputs:
        x: features dataframe
        threshold: features with correlations greater than this value are removed
    Output:
        dataframe that contains only the non-highly-collinear features
    '''
    corr_matrix = x.corr()
    n_cols = len(corr_matrix.columns)
    flagged = []
    # scan the strict upper triangle: each pair (row, col) with row < col
    for col in range(1, n_cols):
        for row in range(col):
            if abs(corr_matrix.iloc[row, col]) >= threshold:
                # flag the later column of the correlated pair for removal
                flagged.append(corr_matrix.columns[col])
    drops = set(flagged)
    x = x.drop(columns=drops)
    print('Removed Columns {}'.format(drops))
    return x
#dataframe from upper step and number of best variables to output
def variableselection_continuous(df,best,target,features_to_select):
    """Rank features by a vote of four selectors (zero-variance filter,
    Pearson correlation, linear-regression RFE, LightGBM importance), keep
    the *best* top-voted features and drop highly collinear survivors.

    param df: prepared modelling dataframe containing *target* and an 'id' column.
    param best: number of top-voted features to keep.
    param target: name of the target column.
    param features_to_select: how many features each individual selector keeps.
    return: feature dataframe restricted to the selected, de-correlated columns.
    """
    start = time.time()
    # split df into a float target vector and a feature matrix (drops target and id)
    def XyCreationCont(df):
        y = df[target].ravel()
        y = np.array(y).astype(float)
        X = pd.DataFrame(df.drop([target,'id'], axis=1))
        return X,y
    X,y = XyCreationCont(df)
    # =============================================================================
    # #Remove zero variance features
    # =============================================================================
    print('Remove Zero Variance Features..')
    sel = VarianceThreshold(threshold=0)
    sel.fit(X) # fit finds the features with zero variance
    zerovar_support = sel.get_support()
    # NOTE(review): zerovar_support is sized to X *before* the column drops
    # below; if any column gets deleted, its length no longer matches
    # feature_name and the voting DataFrame construction would fail — confirm
    # =============================================================================
    # #Remove missing
    # =============================================================================
    print('Remove Missing..')
    # drop columns that are more than 90% missing
    for col in X.columns:
        if X[col].isnull().sum()/X.shape[0] > 0.9:
            print(col)
            del X[col]
    # =============================================================================
    # #Remove low variance features
    # =============================================================================
    # drop constant (single-valued) columns
    for col in X.columns:
        if X[col].nunique()==1:
            print(col)
            del X[col]
    # =============================================================================
    # #Pearson Correlation
    # =============================================================================
    print('Pearson Correlation..')
    # keep the features_to_select features with the largest |corr(feature, y)|
    def cor_selector(X, y):
        cor_list = []
        # calculate the correlation with y for each feature
        for i in X.columns.tolist():
            cor = np.corrcoef(X[i], y)[0, 1]
            cor_list.append(cor)
        # replace NaN with 0
        cor_list = [0 if np.isnan(i) else i for i in cor_list]
        # names of the top-|correlation| features
        cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-features_to_select:]].columns.tolist()
        # boolean support mask aligned with feature_name
        cor_support = [True if i in cor_feature else False for i in feature_name]
        return cor_support, cor_feature
    feature_name = X.columns.tolist()
    cor_support, cor_feature = cor_selector(X, y)
    # =============================================================================
    # #Wrapper RFE selector
    # =============================================================================
    print('RFE selector..')
    # scale to [0, 1] so the linear model's coefficients are comparable
    X_norm = MinMaxScaler().fit_transform(X)
    rfe_selector = RFE(estimator=LinearRegression(), n_features_to_select=features_to_select, step=120, verbose=5)
    rfe_selector.fit(X_norm, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:,rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    # =============================================================================
    # #Light GBM
    # =============================================================================
    print('Light GBM ..')
    lgbc=LGBMRegressor(boosting_type='gbdt',
                       num_leaves=31,
                       max_depth=6,
                       learning_rate=0.08,
                       n_estimators=300, colsample_bytree=0.2,
                       reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=30)
    # keep features whose LightGBM importance clears SelectFromModel's default threshold
    embeded_lgb_selector = SelectFromModel(lgbc)
    embeded_lgb_selector.fit(X, y)
    embeded_lgb_support = embeded_lgb_selector.get_support()
    embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
    print(str(len(embeded_lgb_feature)), 'selected features')
    # =============================================================================
    # #Select top x features via voting mechanism
    # =============================================================================
    # put all selection together
    feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Zero Variance':zerovar_support, 'Pearson':cor_support, 'RFE':rfe_support,
                                         'LightGBM':embeded_lgb_support})
    # count the selected times for each feature
    # NOTE(review): row-wise sum over a frame containing the string 'Feature'
    # column; older pandas silently ignored it — confirm on current version
    feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
    # rank by vote count, then by name
    feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
    feature_selection_df.index = range(1, len(feature_selection_df)+1)
    best_features = feature_selection_df.head(best)
    best_features = list(best_features['Feature'])
    # print best features
    print(best_features)
    Xbest = X[best_features]
    # drop features that are highly correlated with another kept feature
    Xbest = remove_collinear_features(Xbest,0.85)
    end = time.time()
    print((end - start)/60)
    return Xbest
'''Function to plot scatter plot'''
def scatter_plot(x, y, title, xaxis, yaxis, size, c_scale):
    """Render an interactive plotly scatter plot, colouring markers by the
    y value on the given colour scale."""
    marker_style = dict(color=y, size=size, showscale=True, colorscale=c_scale)
    trace = go.Scatter(x=x, y=y, mode='markers', marker=marker_style)
    layout = go.Layout(
        hovermode='closest',
        title=title,
        xaxis=dict(title=xaxis),
        yaxis=dict(title=yaxis))
    figure = go.Figure(data=[trace], layout=layout)
    return iplot(figure)
The dataframe has a couple of issues which need either data cleaning or imputation before it can produce value. A few of the underlying issues are as follows:
# --- Exploratory run: load and prepare the data, then inspect the target ---
#Load data
asking_data = LoadData(path,'asking_data')
#Clean and Prepare data
prepared_data = Data_Preparation(asking_data)
prepared_data.info()
# keep only numeric columns for the summary-statistics table
prepared_data_num = prepared_data.select_dtypes(include = ['float64','int64','uint8','int32'])
resumetable(prepared_data_num)
# compare the raw target with its log transform for normality
CheckDistribution(prepared_data,'price_m2')
CheckNormality(prepared_data,'price_m2')
CheckDistribution(prepared_data,'log_price_m2')
CheckNormality(prepared_data,'log_price_m2')
Let's plot them all:
# histograms of every numeric feature
prepared_data_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8); # ; avoid having the matplotlib verbose informations
# correlation heatmap of the numeric features (target excluded; its
# correlations with each feature are examined separately below)
corr = prepared_data_num.drop('log_price_m2', axis=1).corr()
plt.figure(figsize=(20, 16))
# show only the strong correlations to keep the heatmap readable
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
Below we check the correlation of every numeric variable with the target, ordered from lowest to highest.
individual_features_df = []
# pair each numeric feature (skipping the first three columns) with the
# target, dropping zero rows so imputed zeros do not distort the correlation
for i in range(0, len(prepared_data_num.columns) - 3):
    tmpDf = prepared_data_num[[prepared_data_num.columns[i+3], 'log_price_m2']]
    tmpDf = tmpDf[tmpDf[prepared_data_num.columns[i+3]] != 0]
    individual_features_df.append(tmpDf)
# feature -> correlation with the log target, sorted ascending
all_correlations = {feature.columns[0]: feature.corr()['log_price_m2'][0] for feature in individual_features_df}
all_correlations = sorted(all_correlations.items(), key=operator.itemgetter(1))
for (key, value) in all_correlations:
    print("{:>15}: {:>15}".format(key, value))
We will choose only the features with high correlation (>30%) and plot them against the target so that we can gain more insights.
# features whose absolute correlation with the target is at least 0.3
corr_features_list = [key for key, value in all_correlations if abs(value) >= 0.3]
print("There is {} strongly correlated values with price_m2:\n{}".format(len(corr_features_list), corr_features_list))
# regression plot of each strongly-correlated feature against the log target
fig, ax = plt.subplots(round(len(corr_features_list) / 3), 3, figsize = (25, 18))
for i, ax in enumerate(fig.axes):
    if i < len(corr_features_list) - 1:
        sns.regplot(x=corr_features_list[i],y=prepared_data_num['log_price_m2'], data=prepared_data_num[corr_features_list], ax=ax)
# Subplots of categorical / binary features vs. the log target
sns.set_style('darkgrid')
f, axes = plt.subplots(3,2, figsize = (15,15))
# Plot [0,0]: floor number vs. price
sns.boxplot(data = prepared_data, x = 'floor', y = 'log_price_m2', ax = axes[0, 0])
axes[0,0].set_xlabel('floor')
axes[0,0].set_ylabel('Price')
axes[0,0].set_title('Floor & Price')
# Plot [0,1]: studio flag vs. price
sns.boxplot(data = prepared_data, x = 'property_studio flat', y = 'log_price_m2', ax = axes[0, 1])
axes[0,1].set_xlabel('Studio')
axes[0,1].set_ylabel('Price')
axes[0,1].set_title('Studio & Price')
# Plot [1,0]: villa flag vs. price
sns.boxplot(x = 'property_villa', y = 'log_price_m2', data = prepared_data, ax = axes[1,0])
axes[1,0].set_xlabel('Villa')
axes[1,0].set_ylabel('Price')
axes[1,0].set_title('Villa & Price')
# Plot [1,1]: apartment flag vs. price
sns.boxplot(x = 'property_apartment', y = 'log_price_m2', data = prepared_data, ax = axes[1,1])
axes[1,1].set_xlabel('Apartment')
axes[1,1].set_ylabel('Price')
axes[1,1].set_title('Apartment & Price')
# Plot [2,0]: ground-floor flag vs. price
sns.boxplot(x = 'Ground floor ohe', y = 'log_price_m2', data = prepared_data, ax = axes[2,0])
axes[2,0].set_xlabel('Ground floor')
axes[2,0].set_ylabel('Price')
axes[2,0].set_title('Ground floor & Price')
# Plot [2,1]: basement flag vs. price
sns.boxplot(x = 'Basement ohe', y = 'log_price_m2', data = prepared_data, ax = axes[2,1])
axes[2,1].set_xlabel('Basement')
axes[2,1].set_ylabel('Price')
axes[2,1].set_title('Basement & Price')
# folium Marker expects floats, so cast the coordinate columns first
prepared_data['latitude'] = prepared_data['latitude'].astype(float)
prepared_data['longitude'] = prepared_data['longitude'].astype(float)
# Create the map (centred near the kavala reference point defined above)
m_3 = folium.Map(location=[40.93605,24.413038], tiles='cartodbpositron', zoom_start=13)
# Add each property with valid coordinates to a marker cluster
mc = MarkerCluster()
for idx, row in prepared_data.iterrows():
    if not math.isnan(row['longitude']) and not math.isnan(row['latitude']):
        mc.add_child(Marker([row['latitude'], row['longitude']]))
m_3.add_child(mc)
# Display the map (rendered as notebook cell output)
m_3
In the following flow we load the data, clean and manipulate it, engineer features, choose a subset of variables, and create a dataset ready for training a machine learning model.
# --- Full modelling flow: load -> clean -> select features -> split ---
#Load data
asking_data = LoadData(path,'asking_data')
greek_localities = LoadData(path,'greek_localities')
#Clean and Prepare data
greek_localities = CleanGreekLocalities(greek_localities)
prepared_data = Data_Preparation(asking_data)
prepared_data = ChooseColumns(prepared_data)
#Variable Selection: keep the 28 best-voted features (each selector keeps 20)
prepared_data_best = variableselection_continuous(prepared_data,28,'log_price_m2',20)
#keep those features only
feat_list= list(prepared_data_best.columns)
# Split train test (10% holdout) with robust scaling of the features
x_train, x_test, y_train, y_test = ModelCreation(prepared_data,0.1,'log_price_m2',feat_list)
'''We are interested in the following 13 regression models.
All initialized with default parameters except random_state and n_jobs.'''
# Linear models
linear = LinearRegression(n_jobs = -1)
lasso = Lasso(random_state = seed)
ridge = Ridge(random_state = seed)
elnt = ElasticNet(random_state = seed)
# Single-estimator non-linear models
dt = DecisionTreeRegressor(random_state = seed)
svm = SVR()
knn = KNeighborsRegressor(n_jobs = -1)
# Tree ensembles
rf = RandomForestRegressor(n_jobs = -1, random_state = seed)
et = ExtraTreesRegressor(n_jobs = -1, random_state = seed)
ab = AdaBoostRegressor(random_state = seed)
gb = GradientBoostingRegressor(random_state = seed)
# Gradient-boosting frameworks
xgb = XGBRegressor(random_state = seed, n_jobs = -1)
lgb = LGBMRegressor(random_state = seed, n_jobs = -1)
# Calculate the train/test-split score of the different models and plot them.
models = [linear, lasso, ridge, elnt, dt, svm, knn, rf, et, ab, gb, xgb, lgb]
train_test_split_rmse = []
for model in models:
    # cv_results_print evaluates the model on a 10% holdout split.
    train_test_split_rmse.append(cv_results_print(model, x_train, y_train, 0.10, seed))
# Collect the (RMSE, MAE) pairs into a dataframe indexed by model abbreviation.
train_test_score = pd.DataFrame(data = train_test_split_rmse, columns = ['Train_Test_RMSE','Train_Test_MAE'])
train_test_score.index = ['LN', 'LASS','RID', 'ELNT', 'DT', 'SVM', 'KNN', 'RF', 'ET', 'AB', 'GB', 'XGB', 'LGB']
train_test_score = train_test_score.round(5)
x = train_test_score.index
y = train_test_score['Train_Test_RMSE']
# Fixed label: the holdout fraction used above is 0.10, not 15%.
title = "Models' Test Score (RMSE) on Holdout(10%) Set"
scatter_plot(x, y, title, 'Models','RMSE', 25, 'RdBu')
By comparing all models based on RMSE (Root Mean Squared Error), we see that the best model is produced by LightGBM.
# Same comparison plot, this time ranking the models by MAE.
y = train_test_score['Train_Test_MAE']
# Fixed label: the holdout fraction used for scoring is 10%, not 15%.
title = "Models' Test Score (MAE) on Holdout(10%) Set"
scatter_plot(x, y, title, 'Models','MAE', 25, 'RdBu')
By comparing all models based on MAE (Mean Absolute Error), we see that the best model is produced by the Extra Trees Regressor algorithm.
# Display the feature subset used by all models above.
feat_list
The best model is LightGBM, so we will use hyperparameter tuning to find the set of parameters that produces the best possible fit for the data.
To ensure that we produce the best possible version of LightGBM — the fit that minimizes the residuals — we need to search over the hyperparameter space and find the minima that accomplish this goal.
# Define the hyperopt search space for LightGBM.
# Fixed settings: regression objective, RMSE metric, gbdt boosting, GPU device 0.
hyper_space = {'objective': 'regression',
'metric':'rmse',
'boosting':'gbdt', 'gpu_device_id': 0,
# Ensemble size and tree complexity
'n_estimators': hp.choice('n_estimators', [25, 40, 50, 75, 100, 250, 500]),
'max_depth': hp.choice('max_depth', list(range(6, 18, 2))),
'num_leaves': hp.choice('num_leaves', list(range(20, 180, 20))),
# Row and column subsampling plus learning rate
'subsample': hp.choice('subsample', [.7, .8, .9, 1]),
'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1),
'learning_rate': hp.uniform('learning_rate', 0.03, 0.12),
# L1/L2 regularisation and minimum leaf size
'reg_alpha': hp.choice('reg_alpha', [.1, .2, .3, .4, .5, .6]),
'reg_lambda': hp.choice('reg_lambda', [.1, .2, .3, .4, .5, .6]),
'min_child_samples': hp.choice('min_child_samples', [20, 45, 70, 100])}
# Set the number of optimisation evaluations
MAX_EVALS= 10
def evaluate_metric(params):
    """Hyperopt objective: mean cross-validated RMSE of an LGBM fit.

    Relies on module-level ``x_train``, ``y_train`` and ``seed``.
    Returns the dict shape hyperopt expects: ``{'loss': ..., 'status': ...}``.
    """
    model = LGBMRegressor(**params)
    splitter = ShuffleSplit(n_splits=10, test_size=0.10, random_state=seed)
    # sklearn reports negated MSE, so flip the sign before taking the root.
    neg_mse = cross_val_score(model, x_train, y_train, cv=splitter,
                              scoring='neg_mean_squared_error')
    return {'loss': np.sqrt(-neg_mse).mean(), 'status': STATUS_OK}
# Fit the Tree-structured Parzen Estimator over the defined search space.
best_vals = fmin(evaluate_metric,
space=hyper_space,
verbose=-1,
algo=tpe.suggest,
max_evals=MAX_EVALS)
# `space_eval` was used without being imported (the file header only imports
# hp, fmin, tpe, Trials and STATUS_OK from hyperopt) -- import it here so
# this cell actually runs.
from hyperopt import space_eval
# Map the raw hp.choice indices returned by fmin back to real parameter values.
best_params = space_eval(hyper_space, best_vals)
Next we obtain the optimal hyperparameters from the search over the hyperparameter space, performed by iteratively fitting models and selecting the configuration with the minimum RMSE.
# Display the optimal hyperparameters found by the search.
best_params
# Refit LightGBM with the tuned parameters and report holdout RMSE/MAE.
clf_best = LGBMRegressor(**best_params)
clf_best.fit(x_train, y_train)
cv_results_print(clf_best, x_train, y_train,0.10,seed)
By checking the fit with the chosen parameters, we observe that better performance is achieved for both RMSE and MAE.
# Predict on the untouched test set with the tuned model.
y_pred=clf_best.predict(x_test)
# Back-transform from log(price/m2) to actual price per m2.
y_pred_true = np.exp(y_pred)
y_test_true = np.exp(y_test)
# Report RMSE then MAE, first on the log scale the model was trained on,
# then in the original price units.
for actual, predicted in ((y_test, y_pred), (y_test_true, y_pred_true)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(mean_absolute_error(actual, predicted))
# Predicted vs. actual scatter on log-log axes with an identity reference line.
plt.figure(figsize=(10, 10))
plt.scatter(y_test, y_pred, c='crimson')
plt.xscale('log')
plt.yscale('log')
# End points of the y = x line spanning the combined data range.
upper = max(max(y_pred), max(y_test))
lower = min(min(y_pred), min(y_test))
plt.plot([upper, lower], [upper, lower], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()
We write the test sample into a dataframe so that we can check the performance with print_metrics.py.
# Collect actual vs. predicted price per m2 for the holdout sample.
df_results = pd.DataFrame({'actual_price_m2':y_test_true, 'predicted_price_m2':y_pred_true})
df_results
# NOTE(review): hard-coded absolute Windows path -- this will only work on the
# author's machine; consider making the output location configurable.
df_results.to_csv('C:/Users/Nikitas/Desktop/Nikitas/other_pet_projects/recognyte/df_results.csv', index=False)
# Rebuild a DataFrame from the test matrix so SHAP plots can show feature names.
x_test_df = pd.DataFrame(x_test, columns=feat_list)
# Compute SHAP values once on the named-column frame.
# (The original cell computed shap_values twice -- on x_test_df and then again
# on the raw x_test array, silently overwriting the first result -- and
# re-imported shap, which is already imported at the top of the file. The
# IPython-only %time magics were dropped along with the duplicate work.)
shap_values = shap.TreeExplainer(clf_best).shap_values(x_test_df)
# Global feature importance as mean |SHAP value| per feature.
shap.summary_plot(shap_values, x_test_df, plot_type="bar", color='blue')
This feature importance representation shows that if we color each dot by the value of the original feature, we can see how a low or high feature value affects the model output. For example, crete_her_distance is the most important feature, and it seems that low values (blue dots) have higher SHAP values whereas higher values have lower SHAP values. This can be interpreted as: places near Crete have higher prices, whereas places far from Crete have lower ones. Some features may not have a global impact, like property_villa, but still have a significantly high impact for a particular subset, which can be interpreted as: properties of type villa have significantly higher prices.
# Beeswarm summary: per-sample SHAP values, coloured by the feature value.
shap.summary_plot(shap_values, x_test_df)
Here you can scroll through the top features and see lots of interesting patterns and interactions, such as how newer houses are more likely to have parking spots than older ones (see the house_age - has_parking plot).
# Dependence plots for the features in `inds2`, iterated in reverse order.
# NOTE(review): `inds2` is defined elsewhere in the notebook -- presumably
# feature indices ordered by importance; confirm at its definition.
for feature in reversed(inds2):
    shap.dependence_plot(feature, shap_values, x_test_df)
Features pushing the prediction higher than the base value are shown in red, while those pushing the prediction lower are shown in blue.
# Explain a single prediction: which features push test sample 3 above or
# below the expected (base) value.
explainer = shap.TreeExplainer(clf_best)
shap.force_plot(explainer.expected_value, shap_values[3,:], x_test_df.iloc[3,:])
# Look up the raw listing behind this sample -- presumably original row id 351;
# TODO confirm the mapping between 'Unnamed: 0' and the test index.
asking_data[asking_data['Unnamed: 0'] == 351]
# Find the prepared-data row whose target matches test sample 3's label.
prepared_data[prepared_data['log_price_m2'] == y_test[3]]